# ==============================================================================
# recoding-main.R
# author: Anselm Hager / Bernhard Clemm
# ==============================================================================

# Project root: two directory levels above the path of the script currently
# open in the RStudio source editor (presumably requires running in RStudio).
dir <- dirname(dirname(rstudioapi::getSourceEditorContext()$path))
# Run the project's setup script (presumably attaches the packages used below;
# its contents are not visible here).
source(paste0(dir, "/code/setup-packages.R"))

# DATA =========================================================================

# Read one merged election-results file from <dir>/data/votes/ and harmonise it:
#   * rename `date_first_attack` to `date_attack`
#   * store `ags` as character, left-padding 7-character codes with "0"
#   * parse `date_election` ("%Y-%m-%d") as Date
#
# Args:
#   file_name: file name inside <dir>/data/votes/.
# Returns the harmonised data.frame.
read_votes <- function(file_name) {
  read.csv(paste0(dir, "/data/votes/", file_name)) %>%
    dplyr::rename(date_attack = date_first_attack) %>%
    mutate(
      # Coerce first: if no code needs padding, ifelse() alone would hand back
      # the original (integer) column, which cannot be joined to the character
      # `ags` key of `covariates`.
      ags = as.character(ags),
      ags = ifelse(nchar(ags) == 7, paste0("0", ags), ags),
      date_election = as.Date(date_election, format = "%Y-%m-%d")
    )
}

btw <- read_votes("btw_merge.csv")
ltw <- read_votes("ltw_merge.csv")
kw <- read_votes("kw_merge.csv")

# ADD UNEMPLOYMENT AND LAND USE COVARIATES =====================================

## Data ####
# Yearly covariate table. The key `ags` is derived from `ags_2021`, stored as
# character, and 7-character codes are left-padded with "0".
covariates <- read.csv(
  paste0(dir, "/data/covariates/covariates_empl_land.csv")
) %>%
  mutate(
    ags = as.character(ags_2021),
    ags = ifelse(nchar(ags) == 7, paste0("0", ags), ags)
  ) %>%
  select(-ags_2021)

## Function to calculate election-period values and to join ####
# Attach election-period averages of the yearly covariates to an election
# data set.
#
# Args:
#   dt:     election data with at least `ags` and a Date column
#           `date_election`; may hold several rows per ags/election
#           (e.g. one per party).
#   covars: yearly covariate table keyed by `ags` and `year`. Defaults to the
#           global `covariates`, so existing one-argument calls are unchanged.
#
# Returns a data.frame: the per-(ags, date_election) covariate averages
# left-joined with `dt`. It keeps `area_total` and `pop_total`, adds
# `prop_agri`, `prop_heathen` and `prop_unempl`, and drops the raw
# `area_agri`, `area_heathen`, `pop_men`, `pop_women` and `unempl_total`.
adding_covars <- function(dt, covars = covariates) {
  covar_cols <- c(
    "area_total", "area_agri", "area_heathen",
    "pop_total", "pop_men", "pop_women", "unempl_total"
  )

  # Step 1: first and last covariate year of every election period.
  # Covariates are yearly, so years are assigned with a June cutoff:
  #   * an election in July-December closes its period with its own year,
  #     an election in January-June with the year before;
  #   * the period opens in the previous election's year if that election was
  #     in January-June, otherwise in the year after it.
  # The first election of an ags has no previous election, so only its
  # `last_year` survives the NA filter.
  election_years <- dt %>%
    as.data.frame() %>%
    # party/percent are irrelevant here, so keep unique ags/election pairs
    select(ags, date_election) %>%
    unique() %>%
    arrange(ags, date_election) %>%
    group_by(ags) %>%
    mutate(
      date_prev_election = dplyr::lag(date_election),
      last_year = ifelse(
        as.numeric(format(date_election, "%m")) <= 6,
        as.numeric(format(date_election, "%Y")) - 1,
        as.numeric(format(date_election, "%Y"))
      ),
      first_year = ifelse(
        as.numeric(format(date_prev_election, "%m")) <= 6,
        as.numeric(format(date_prev_election, "%Y")),
        as.numeric(format(date_prev_election, "%Y")) + 1
      )
    ) %>%
    pivot_longer(first_year:last_year) %>%
    rename("year" = value) %>%
    select(-c(name, date_prev_election)) %>%
    arrange(ags, date_election, year) %>%
    filter(!is.na(year))

  # Step 2: add a row for every year between the first and last year of each
  # election period.
  # [This takes time to execute!]
  period_years <- election_years %>%
    group_by(ags, date_election) %>%
    complete(year = full_seq(min(year):max(year), 1)) %>%
    arrange(ags, date_election, year)

  # Step 3: join the yearly covariates and average them over each period.
  period_means <- period_years %>%
    left_join(covars, by = c("ags", "year")) %>%
    group_by(date_election, ags) %>%
    summarize(
      across(all_of(covar_cols), ~ mean(.x, na.rm = TRUE)),
      .groups = "drop_last"
    ) %>%
    # ags/election combinations without any covariate data average to NaN
    # (mean of an empty vector); store those, and any -Inf, as NA.
    mutate(across(
      all_of(covar_cols),
      ~ ifelse(.x == -Inf | is.nan(.x), NA, .x)
    )) %>%
    as.data.frame()

  # Step 4: join the election rows back on and derive proportional measures;
  # the raw components that are no longer needed are dropped.
  period_means %>%
    left_join(dt, by = c("ags", "date_election")) %>%
    mutate(
      prop_agri = area_agri / area_total,
      prop_heathen = area_heathen / area_total,
      prop_unempl = unempl_total / pop_total
    ) %>%
    select(-c(
      area_agri, area_heathen,
      pop_men, pop_women, unempl_total
    ))
}

# Enrich each of the three election data sets with election-period covariates.
btw <- btw %>% adding_covars()
ltw <- ltw %>% adding_covars()
kw <- kw %>% adding_covars()

# ADD REFUGEES COVARIATES ======================================================

## Data ####
refugees <- read_csv(paste0(dir, "/data/covariates/covariates_refugees.csv"))
# Normalise the names of columns 4-147: strip spaces, then turn dots into
# underscores.
# NOTE(review): the 4:147 range is hard-coded -- confirm it still matches the
# layout of the CSV.
colnames(refugees)[4:147] <- colnames(refugees)[4:147] %>%
  stringr::str_replace_all(pattern = " ", replacement = "") %>%
  gsub("\\.", "_", .)

## Calculate yearly averages ####
# Reshape the wide refugee table (one column per period) to long format and
# average the values per AGS and year.
refugees <- refugees %>%
  dplyr::select(-c(Gemeinde, Land)) %>%
  # every column except the key is forced to numeric before reshaping
  dplyr::mutate(dplyr::across(-AGS, as.numeric)) %>%
  tidyr::pivot_longer(
    cols = -AGS,
    names_to = "t",
    names_prefix = "m",
    values_to = "refugees",
    names_transform = list(t = as.character),
    # the name must match `values_to`; tidyr silently ignores unmatched names
    values_transform = list(refugees = as.numeric)
  ) %>%
  dplyr::mutate(
    # keep the part after the first "_" of the former column name and prefix
    # it with "20" (presumably a two-digit year -- confirm against the CSV)
    t = str_split(t, "_", simplify = TRUE)[, 2],
    t = paste0(20, t)
  ) %>%
  dplyr::group_by(AGS, t) %>%
  dplyr::summarise(
    refugees = mean(refugees, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  dplyr::rename(
    year = t,
    ags = AGS
  ) %>%
  dplyr::mutate(
    # coerce first so `ags` is character even when no code needs padding
    # (ifelse() alone would return the column in its original type)
    ags = as.character(ags),
    ags = ifelse(nchar(ags) == 7, paste0("0", ags), ags),
    year = as.numeric(year)
  )
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# NEW 
# Full ags x year grid for 2007-2022, so that every ags has a row for every
# year. stringsAsFactors = FALSE keeps a character `ags` from being converted
# to a factor, so the join key has the same type on both sides.
missing_years <- expand.grid(
  year = 2007:2022,
  ags = unique(refugees$ags),
  stringsAsFactors = FALSE
)
refugees <- refugees %>%
  # ags/year combinations whose observed values were all missing count as 0
  mutate(refugees = ifelse(is.na(refugees), 0, refugees)) %>%
  right_join(missing_years, by = c("ags", "year")) %>%
  arrange(ags, year) %>%
  # Carry the last available value forward *within* each ags. Without the
  # grouping, the final value of one ags would be copied into the leading
  # missing years of the next one; those years now stay NA.
  group_by(ags) %>%
  fill(refugees, .direction = "down") %>%
  ungroup()
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

## Join ####

# Match every election to the refugee figure of its calendar year and collect
# the three election data sets in one named list.
data <- lapply(
  list(btw = btw, ltw = ltw, kw = kw),
  function(elections) {
    elections %>%
      dplyr::mutate(year = lubridate::year(date_election)) %>%
      dplyr::left_join(refugees, by = c("ags", "year"))
  }
)

# The individual objects are no longer needed once everything is in `data`.
rm(btw, ltw, kw, refugees, covariates)

# PREPARE DATA =================================================================

# Values of `state` for which missing treatment variables are left as NA
# instead of being recoded to 0 at the end of the loop body.
missing_states <- c(13, 7)

for (i in seq_along(data)) {

  ## Impute missing vote percentages ####
  data_i <- data[[i]] %>%
    dplyr::group_by(date_election, party) %>%
    dplyr::mutate(
      # a missing share is replaced by the mean share of that party in that
      # election
      percent = ifelse(is.na(percent), mean(percent, na.rm = TRUE), percent),
      ags = as.numeric(ags)
    ) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(
      election_after_attack = ifelse(
        date_election > date_attack, "yes", "no"
      ),
      # row-level helper: the election date if it lies after the attack
      first_treatment_year = ifelse(
        election_after_attack == "yes", as.character(date_election), NA
      )
    )

  ## Generate first-treatment-year variable ####
  # Earliest post-attack election date per ags (ISO date strings sort
  # chronologically); NA if no election of that ags lies after an attack.
  agg <- data_i %>%
    dplyr::group_by(ags) %>%
    dplyr::summarise(
      first_treatment_year = if (all(is.na(first_treatment_year))) {
        NA_character_
      } else {
        min(first_treatment_year, na.rm = TRUE)
      }
    )

  ## Merge first-treatment-year variable to data ####
  # Both tables carry `first_treatment_year`, so the join suffixes them with
  # .x (row-level helper, dropped) and .y (per-ags value, kept).
  data_i <- data_i %>%
    dplyr::left_join(agg, by = "ags") %>%
    dplyr::rename(first_treatment_year = first_treatment_year.y) %>%
    dplyr::select(-c(first_treatment_year.x, election_after_attack)) %>%
    dplyr::mutate(
      treat_cont = ifelse(date_election >= first_treatment_year, 1, 0)
    )

  ## Generate integer variables for election date ####
  # `election_date_int` numbers the election dates within each ags in
  # increasing order (factor levels of the ISO date strings).
  data_i <- data_i %>%
    dplyr::mutate(
      date_election = as.character(date_election),
      first_treatment_year = as.character(first_treatment_year)
    ) %>%
    dplyr::group_by(ags) %>%
    dplyr::arrange(date_election) %>%
    dplyr::mutate(
      election_date_int = as.numeric(as.factor(date_election))
    ) %>%
    dplyr::ungroup()

  ## Generate integer variable for first treatment date, time to treatment,
  ## and the estimator-specific copies ####
  # The subset below must have length 1 or the group size, i.e. it relies on
  # each treated ags x party group having a row at the first-treatment
  # election; for untreated groups the comparison is NA and yields NA.
  data_i <- data_i %>%
    dplyr::group_by(ags, party) %>%
    dplyr::mutate(
      first_treatment_date_int =
        election_date_int[date_election == first_treatment_year],
      time_to_treat = election_date_int - first_treatment_date_int,
      first_treatment_date_csa = first_treatment_date_int,
      first_treatment_date_twfe = first_treatment_date_int,
      time_to_treat_twfe = time_to_treat
    )

  ## Recode missing treatment variables ####
  # Missing values become 0, except in `missing_states`, where they stay NA.
  # NOTE(review): the earlier version (kept as commented-out code in the
  # original) recoded all rows regardless of state and used 10000 for
  # first_treatment_date_twfe and -10000 for time_to_treat_twfe. With 0 for
  # every column the *_twfe variables equal their non-TWFE counterparts --
  # confirm that dropping those sentinel values is intended.
  data_i <- data_i %>%
    mutate(across(
      c(
        treat_cont,
        first_treatment_date_csa,
        first_treatment_date_twfe,
        time_to_treat_twfe,
        time_to_treat
      ),
      ~ ifelse(is.na(.x) & !(state %in% missing_states), 0, .x)
    ))

  data[[i]] <- data_i
}

rm(agg, data_i, i, missing_states)

## Compute lagged treatments for TWFE ####

# For every election data set and every party separately: order the rows by
# ags and election index, then add the treatment indicator lagged by one and
# by two elections within each ags. The result stays grouped by ags.
data <- lapply(data, function(elections) {
  elections %>%
    dplyr::group_by(party) %>%
    dplyr::group_split() %>%
    lapply(function(party_rows) {
      party_rows %>%
        dplyr::arrange(ags, election_date_int) %>%
        dplyr::group_by(ags) %>%
        dplyr::mutate(
          treat_cont_lag_1 = dplyr::lag(treat_cont, n = 1),
          treat_cont_lag_2 = dplyr::lag(treat_cont, n = 2)
        )
    }) %>%
    dplyr::bind_rows()
})

# SAVE DATA AS LIST ============================================================

# Store the processed list under the name `data_processed` and write it to
# <dir>/data/data_processed.RData; load() restores it under that same name.
data_processed <- data
save(data_processed, file = paste0(dir, "/data/data_processed.RData"))
